金庸小说人物嵌入可视化分析
导入库
In [3]:
import os
import re
import jieba
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from gensim.models import Word2Vec
import plotly.express as px
import plotly.graph_objects as go
读取小说文本
In [5]:
# Read every novel (*.txt) under novel_dir into memory.
# os.listdir() returns entries in arbitrary order, so the file names are
# sorted to keep the order of corpus / novel_tags reproducible across runs
# and machines.
novel_dir = "./Jinyong_novels"
novel_files = sorted(f for f in os.listdir(novel_dir) if f.endswith(".txt"))

corpus = []      # full text of each novel, aligned index-by-index with novel_tags
novel_tags = []  # novel title, i.e. the file name without its extension
for file in novel_files:
    with open(os.path.join(novel_dir, file), "r", encoding="utf-8") as f:
        corpus.append(f.read())
    # splitext removes only the trailing extension; str.replace would also
    # delete a ".txt" that happens to occur in the middle of a file name.
    novel_tags.append(os.path.splitext(file)[0])
预处理与分词
In [7]:
# Load the stop-word list (one word per line) into a set for O(1) lookups.
with open("stopwords_cn.txt", "r", encoding="utf-8") as f:
    stopwords = {line.strip() for line in f}

# Register every character name with jieba before segmenting, so that a
# multi-character name is emitted as a single token. Without this the default
# dictionary may split a name into pieces, and the name then never appears as
# a token in the segmented text.
with open("./Jinyong_dic/人名.txt", "r", encoding="utf-8") as f:
    for line in f:
        name = line.strip()
        if name:
            jieba.add_word(name)


def preprocess(text):
    """Segment ``text`` with jieba and drop noise tokens.

    Args:
        text: Raw text to segment.

    Returns:
        The tokens produced by ``jieba.lcut`` in their original order,
        keeping only those that are not whitespace-only, are not in
        ``stopwords``, and are longer than one character.
    """
    words = jieba.lcut(text)
    return [w for w in words if w.strip() and w not in stopwords and len(w) > 1]


# One token list per novel, aligned index-by-index with corpus.
tokenized_corpus = [preprocess(text) for text in corpus]
Building prefix dict from the default dictionary ... Loading model from cache C:\Users\XXX\AppData\Local\Temp\jieba.cache Loading model cost 0.825 seconds. Prefix dict has been built successfully.
训练 Word2Vec 模型
In [9]:
# gensim's optimized training routines truncate any single text longer than
# 10000 tokens. tokenized_corpus holds one token list per whole novel, so
# passing it directly would train on only the first 10000 tokens of each
# novel. Split every novel into consecutive windows of at most that length so
# the full text contributes to training. tokenized_corpus itself is left
# untouched.
MAX_SENTENCE_LEN = 10000
w2v_sentences = [
    novel[start:start + MAX_SENTENCE_LEN]
    for novel in tokenized_corpus
    for start in range(0, len(novel), MAX_SENTENCE_LEN)
]

w2v_model = Word2Vec(
    sentences=w2v_sentences,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=5,      # ignore tokens seen fewer than 5 times
    sg=1,             # skip-gram
    workers=4,
    epochs=20,
)
加载人物名并提取其词向量
In [11]:
# Load the character-name list, one name per line; blank lines are skipped.
with open("./Jinyong_dic/人名.txt", "r", encoding="utf-8") as f:
    name_list = [line.strip() for line in f if line.strip()]

# Whether a name has a vector does not depend on the novel, so check the
# model vocabulary once instead of once per novel.
names_with_vectors = [name for name in name_list if name in w2v_model.wv]

# Collect (vector, name, novel) for every name that occurs in a novel.
name_vecs = []
name_labels = []
name_sources = []
for novel_name, novel in zip(novel_tags, tokenized_corpus):
    # A set gives O(1) membership tests; scanning the token list for every
    # name would cost O(len(novel)) per lookup.
    novel_vocab = set(novel)
    for name in names_with_vectors:
        if name in novel_vocab:
            name_vecs.append(w2v_model.wv[name])
            name_labels.append(name)
            name_sources.append(novel_name)

# De-duplicate: a name found in several novels keeps the first novel it was
# seen in (novels are visited in novel_tags order).
df = pd.DataFrame({"name": name_labels, "source": name_sources})
df["vec"] = name_vecs
# Reset to a contiguous 0..n-1 index so row labels match row positions.
df = df.drop_duplicates(subset=["name"]).reset_index(drop=True)
PCA降维
In [13]:
# Stack the per-character vectors into one matrix (one row per character)
# and project it onto its leading principal components.
vectors = np.stack(list(df["vec"]))

pca_2d = PCA(n_components=2)
pca_3d = PCA(n_components=3)
coords_2d = pca_2d.fit_transform(vectors)
coords_3d = pca_3d.fit_transform(vectors)

# Store each projected axis as its own DataFrame column.
for axis, column in enumerate(["x_2d", "y_2d"]):
    df[column] = coords_2d[:, axis]
for axis, column in enumerate(["x_3d", "y_3d", "z_3d"]):
    df[column] = coords_3d[:, axis]
二维交互式可视化
In [15]:
# Interactive 2D scatter of the PCA projection: one labelled point per
# character, coloured by the novel recorded in the "source" column.
scatter_2d_options = dict(
    x="x_2d",
    y="y_2d",
    color="source",
    text="name",
    title="金庸小说人物词向量的二维PCA可视化",
    width=800,
    height=600,
)
fig_2d = px.scatter(df, **scatter_2d_options)

# Place each name label above its marker.
fig_2d.update_traces(textposition="top center")

# Save a standalone HTML copy, then render the figure inline.
fig_2d.write_html("金庸小说人物词向量的二维PCA可视化.html")
fig_2d.show()
三维交互式可视化
In [17]:
# Interactive 3D scatter of the PCA projection: one labelled point per
# character, coloured by the novel recorded in the "source" column.
scatter_3d_options = dict(
    x="x_3d",
    y="y_3d",
    z="z_3d",
    color="source",
    text="name",
    title="金庸小说人物词向量的三维PCA可视化",
    width=900,
    height=700,
)
fig_3d = px.scatter_3d(df, **scatter_3d_options)

# Set the marker size and place each name label above its marker.
fig_3d.update_traces(marker=dict(size=5), textposition="top center")

# Save a standalone HTML copy, then render the figure inline.
fig_3d.write_html("金庸小说人物词向量的三维PCA可视化.html")
fig_3d.show()